
########
## SETUP
########

knitr::opts_chunk$set(echo = TRUE)
rm(list=ls()) 
set.seed(847509348)

## packages - check if installed and load
packages <- c("haven", "grf", "dplyr", "caret", "fastDummies")

package.check <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x, dependencies = TRUE)
    library(x, character.only = TRUE)
  }
})

 username <- Sys.getenv("USERNAME")
 dta_wd <- paste0("C:/Users/", username, "/Dropbox/CFPB SLCCU Credit Matters Loan Evaluation/CBL_replication_package/03_dta/04_analysis_dtas")
 results_wd <- paste0("C:/Users/", username, "/Dropbox/CFPB SLCCU Credit Matters Loan Evaluation/CBL_replication_package/04_tables/02_individual_tables")
 dta <- read_dta(paste0(dta_wd,"/cf scoredfnumlns.dta"))

# To Run in the server
#dta_wd <- "/home/ekp4095/CBL/data"
#results_wd <- "/home/ekp4095/CBL/results"
#dta <- read_dta(paste0(dta_wd,"/cf scoredfnumlns.dta"))


## Directories - CHANGE!
for (e in 1:3) {

    ###########
    ## Cleaning
    ###########
    
    ## Converting categoricals to dummies?
    dta[,sapply(dta, class) == "haven_labelled"] = 
      lapply(dta[,sapply(dta, class) == "haven_labelled"], as.numeric)
    
    cov <- c(           'svymiss',
                        'creditmiss',
                        'age',
                        'female',
                        'race_black',
                        'married',
                        'adults',
                        'child',
                        'college',
                        'inclt30',
                        'z_insecurity_i',
                        'z_selfcont_i',
                        'z_risk_i',
                        'z_credstatus_i',
                        'z_credknow_i',
                        'tradelines1',
                        'savingsbal_mehb95',
                        'savcheck_mehb95',
                        'slccumemb',
                        'binnoncblslcb',
                        'z_default_index_all_ib',
                        'z_newcredit_ib',
                        'z_liquid_ib')
    
    idvar <-            'surveyid'
    treatment_var <-    'enc'
    outcome_var   <-    'scoredf'
    
    # remove missing outcomes
    clean_dta <- dta[!is.na(dta$scoredf),]
    period <- paste("endline", e, sep="")
    assign("clean_dta", clean_dta %>% filter(!!as.name(period)==1))
    
    ####################
    # Variable selection
    ####################
    
    ## Outcomes of interest
    Y <- as.numeric(clean_dta$scoredf)
    
    ## Treatment variable
    W <- as.integer(clean_dta$enc) 
    
    ## Vector of covariates 
    X <- clean_dta[,c(cov)]
    
    ## Cluster variable
    cluster <- as.integer(clean_dta$surveyid)
    
    #################
    ## Causal forests
    #################
    
    ## We will test 3 models, increasing the number of trees in each one
    
    ## Defining different seeds for tests
    seeds <- c(746) ## 3 seeds at least
    
    ## defining different number of trees - CHANGE!
    num_trees <- c(50000) ## For running in the server

    ## Looping through different number of trees
    for(t in 1:length(num_trees)){
      
      ## Looping through different seeds
      for(j in 1:length(seeds)){
        
        ## First estimation of Y and W to use in models
        Y_forest = regression_forest(X, Y, seed = seeds[j], clusters = cluster)
        Y_hat = predict(Y_forest)$predictions
        W_forest = regression_forest(X, W, seed = seeds[j], clusters = cluster)
        W_hat = predict(W_forest)$predictions
        
        
        ## 1) Selecting variables with high importance
        ## Running forest
        set.seed(seeds[j])
        cf <- causal_forest(X, Y, W,
                            Y.hat = Y_hat, W.hat = W_hat,
                            clusters = cluster,
                            equalize.cluster.weights = FALSE, 
                            seed = seeds[j])
        
        ## Select variables with percentage importance above mean and save in new vector
        x_cf1 <- variable_importance(cf)
        X_selected <- which(variable_importance(cf)>mean(variable_importance(cf)))
        
        ## 2) Model with variables selected in 1
        ## Running forest and save in temp object
        set.seed(seeds[j])
        cf <- causal_forest(X[, X_selected], Y, W,
                            Y.hat = Y_hat, W.hat = W_hat,
                            num.trees = num_trees[t],
                            seed = seeds[j],
                            honesty = TRUE,
                            stabilize.splits = TRUE,
                            clusters = cluster,
                            equalize.cluster.weights = FALSE, 
                            tune.parameters = "none")
        
        ## Save forest in object with new name
        assign(paste("cf", paste("t", t, sep=""), paste("s", j, sep=""), sep ="_"), cf)
        
        ## Predicted HTE (using out of bag predictions)
        pred_temp <- predict(cf, estimate.variance = TRUE)
        pred_temp_q <- ntile(pred_temp$predictions, 10)
        pred_temp_q5 <- ntile(pred_temp$predictions, 5)
        pred_temp_q6 <- ntile(pred_temp$predictions, 6)
        pred_temp_q3 <- ntile(pred_temp$predictions, 3)
        
        ## ATE by sextiles
        ate_q_temp <- lapply(
          seq(5), function(w) {
            ate <- average_treatment_effect(cf, subset = pred_temp_q5 == w)
          })
        ## ATE by sextiles
        ate_s_temp <- lapply(
          seq(6), function(w) {
            ate <- average_treatment_effect(cf, subset = pred_temp_q6 == w)
          })
        ## ATE by terciles
        ate_t_temp <- lapply(
          seq(3), function(w) {
            ate <- average_treatment_effect(cf, subset = pred_temp_q3 == w)
          })
        
        ## Create vectors for var importance with correct dimensions for merging
        varimp <-variable_importance(cf) #weighted sum of how many times feature i was split on at each depth in the forest
        len <- length(names(X)) - length(variable_importance(cf))
        varimp <- append(varimp, rep(NA, len), after = len) #add NAs for all of the other X variables that are not in the importance vector
        names_varimp <- names(X[X_selected]) #pull the names of the important variables 
        names_varimp <- append(names_varimp, rep(NA, len), after = len) #again add NAs for the names of the rest; 
        
        if (j==1 & t==1) {
          ## Predicted HTE (tau hat)
          HTE_pred_df <- clean_dta
          HTE_pred_df[,"y_hat"] <- Y_hat
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- pred_temp$predictions
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- pred_temp$variance.estimates
          ## Percetage of importance of each x in models
          var_imp_df <- data.frame(varimp, names_varimp) ## Empty data frame first time
          colnames(var_imp_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "perc", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "var", sep="_"))
          
          ## ATE by quintiles
          ATE_q_df <-data.frame(do.call(rbind, ate_q_temp))
          colnames(ATE_q_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "ateq", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "seq", sep="_"))
          
          ATE_s_df <-data.frame(do.call(rbind, ate_s_temp))
          colnames(ATE_s_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "ates", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "ses", sep="_"))
          
          ATE_t_df <-data.frame(do.call(rbind, ate_t_temp))
          colnames(ATE_t_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "atet", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "set", sep="_"))
          
          
          ## Test for heterogeity
          test_cal_df <- data.frame(test_calibration(cf)[2], test_calibration(cf)[4], test_calibration(cf)[6], test_calibration(cf)[8], 
                                    test_calibration(cf)[1], test_calibration(cf)[3], test_calibration(cf)[5], test_calibration(cf)[7], nrow(cf$predictions)) ## Empty data frame first time
          #colnames(test_cal_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "test", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep="_"))
          colnames(test_cal_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep="_"), 
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "tval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpred", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestse", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "N", sep="_"))     
        }
        else {
          ## Predicted HTE (tau hat)
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- pred_temp$predictions
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- pred_temp$variance.estimates
          ## Percetage of importance of each x in models
          var_imp_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "perc", sep ="_")]] <- varimp
          var_imp_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "var", sep ="_")]] <- names_varimp
          ## ATE by quintiles
          ATE_q_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "ateq", sep ="_")]] <- do.call(rbind, ate_q_temp)[, 1]
          ATE_q_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "seq", sep ="_")]] <- do.call(rbind, ate_q_temp)[, 2]
          ATE_s_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "ates", sep ="_")]] <- do.call(rbind, ate_s_temp)[, 1]
          ATE_s_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "ses", sep ="_")]] <- do.call(rbind, ate_s_temp)[, 2]
          ATE_t_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "atet", sep ="_")]] <- do.call(rbind, ate_t_temp)[, 1]
          ATE_t_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "set", sep ="_")]] <- do.call(rbind, ate_t_temp)[, 2]
          ## Test for heterogeity
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- test_calibration(cf)[2]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- test_calibration(cf)[4]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "tval", sep ="_")]] <- test_calibration(cf)[6]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep ="_")]] <- test_calibration(cf)[8]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpred", sep ="_")]] <- test_calibration(cf)[1]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestse", sep ="_")]] <- test_calibration(cf)[3]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforesttval", sep ="_")]] <- test_calibration(cf)[5]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpval", sep ="_")]] <- test_calibration(cf)[7]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "N", sep ="_")]] <- nrow(cf$predictions)
        }
        ## Deciles for predicted HTE
        HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "q3", sep ="_")]] <- pred_temp_q3
      }
    }

    ## Save data frames into csv
    write.csv(HTE_pred_df, paste(results_wd, paste("/HTE_scoredf_numloans_e",e, "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(var_imp_df, paste(results_wd, paste("/varimp_scoredf_numloans_e",e, "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(test_cal_df, paste(results_wd, paste("/testcal_scoredf_numloans_e",e, "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(ATE_q_df, paste(results_wd, paste("/ATEq_scoredf_numloans_e", e,  "_df", sep="_"), ".csv", sep = ""), row.names = FALSE)
    write.csv(ATE_s_df, paste(results_wd, paste("/ATEs_scoredf_numloans_e", e,  "_df", sep="_"), ".csv", sep = ""), row.names = FALSE)
    write.csv(ATE_t_df, paste(results_wd, paste("/ATEt_scoredf_numloans_e", e,  "_df", sep="_"), ".csv", sep = ""), row.names = FALSE)
    
    ## Cleaning workspace 
    #rm(cf, pred_temp_q, ate_q_temp, ate_s_temp, ate_t_temp)
    
    ###################
    ## Saving workspace
    ###################
    
    #save.image(paste(results_wd,"/causalforestscoredfnumloans_6mos.RData", sep = ""))
}